About the data:

Series ID: GSE21933 Platform: Phalanx Human OneAray v5 GPL6254

Experiment type: Expression profiling by Array Details:

Phalanx Biotech Group’s Human OneArray v5 contains 32,048 features, 30968 detection probes and 1080 control probes, spotted onto glass slides using a proprietary non-contact printing method. Detection probes are annotated against the human genome and grouped into the following categories:

Lo FY, Chang JW, Chang IS, Chen YJ et al. The database of chromosome imbalance regions and genes resided in lung cancer from Asian and Caucasian identified by array-comparative genomic hybridization. BMC Cancer 2012 Jun 12;12:235. PMID: 22691236

Data and Normalization

data_id <- "GSE21933"

eset <-getGEO(data_id) #get dataset
## Found 1 file(s)
## GSE21933_series_matrix.txt.gz
gse <- eset[[1]] #get gene dataset

# head(pData(gse)) #phenotype data
# head(fData(gse)) #feature data
# head(exprs(gse)) #complete expression data set

summary(exprs(gse)) #statistical analysis of all summary()
##    GSM545615         GSM545616         GSM545617          GSM545618      
##  Min.   : 0.1536   Min.   : 0.4077   Min.   : 0.04938   Min.   : 0.5312  
##  1st Qu.: 3.8381   1st Qu.: 3.8453   1st Qu.: 3.84864   1st Qu.: 3.8799  
##  Median : 5.8976   Median : 5.8950   Median : 5.89495   Median : 5.8941  
##  Mean   : 6.2761   Mean   : 6.2757   Mean   : 6.27665   Mean   : 6.2752  
##  3rd Qu.: 8.4292   3rd Qu.: 8.4269   3rd Qu.: 8.42835   3rd Qu.: 8.4281  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545619        GSM545620         GSM545621         GSM545622       
##  Min.   : 0.283   Min.   : 0.3436   Min.   : 0.1726   Min.   : 0.01868  
##  1st Qu.: 3.819   1st Qu.: 3.8353   1st Qu.: 3.8568   1st Qu.: 3.84881  
##  Median : 5.894   Median : 5.8965   Median : 5.8949   Median : 5.89586  
##  Mean   : 6.277   Mean   : 6.2760   Mean   : 6.2766   Mean   : 6.27655  
##  3rd Qu.: 8.430   3rd Qu.: 8.4286   3rd Qu.: 8.4292   3rd Qu.: 8.42895  
##  Max.   :15.999   Max.   :15.9989   Max.   :15.9989   Max.   :15.99893  
##    GSM545623         GSM545624         GSM545625           GSM545626     
##  Min.   : 0.2089   Min.   : 0.2229   Min.   : 0.008928   Min.   : 0.039  
##  1st Qu.: 3.8365   1st Qu.: 3.8332   1st Qu.: 3.852787   1st Qu.: 3.845  
##  Median : 5.8907   Median : 5.8982   Median : 5.895908   Median : 5.896  
##  Mean   : 6.2757   Mean   : 6.2759   Mean   : 6.275721   Mean   : 6.276  
##  3rd Qu.: 8.4298   3rd Qu.: 8.4313   3rd Qu.: 8.429130   3rd Qu.: 8.428  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.998947   Max.   :15.999  
##    GSM545627         GSM545628          GSM545629         GSM545630      
##  Min.   : 0.0598   Min.   : 0.01137   Min.   : 0.2113   Min.   : 0.1679  
##  1st Qu.: 3.8114   1st Qu.: 3.82077   1st Qu.: 3.8270   1st Qu.: 3.8655  
##  Median : 5.9008   Median : 5.89057   Median : 5.8864   Median : 5.8968  
##  Mean   : 6.2769   Mean   : 6.27676   Mean   : 6.2747   Mean   : 6.2761  
##  3rd Qu.: 8.4302   3rd Qu.: 8.43115   3rd Qu.: 8.4314   3rd Qu.: 8.4304  
##  Max.   :15.9989   Max.   :15.99893   Max.   :15.9989   Max.   :15.9989  
##    GSM545631          GSM545632        GSM545633          GSM545634        
##  Min.   : 0.02635   Min.   : 0.260   Min.   : 0.05111   Min.   : 0.009472  
##  1st Qu.: 3.83963   1st Qu.: 3.867   1st Qu.: 3.85254   1st Qu.: 3.840950  
##  Median : 5.89913   Median : 5.900   Median : 5.89552   Median : 5.894485  
##  Mean   : 6.27604   Mean   : 6.275   Mean   : 6.27769   Mean   : 6.277425  
##  3rd Qu.: 8.42879   3rd Qu.: 8.428   3rd Qu.: 8.43166   3rd Qu.: 8.430225  
##  Max.   :15.99893   Max.   :15.999   Max.   :15.99890   Max.   :15.998947  
##    GSM545635        GSM545636        GSM545637         GSM545638      
##  Min.   : 0.000   Min.   : 0.523   Min.   : 0.3539   Min.   : 0.3566  
##  1st Qu.: 3.844   1st Qu.: 3.834   1st Qu.: 3.8428   1st Qu.: 3.8392  
##  Median : 5.894   Median : 5.889   Median : 5.9023   Median : 5.8975  
##  Mean   : 6.277   Mean   : 6.273   Mean   : 6.2740   Mean   : 6.2766  
##  3rd Qu.: 8.429   3rd Qu.: 8.427   3rd Qu.: 8.4290   3rd Qu.: 8.4297  
##  Max.   :15.999   Max.   :15.999   Max.   :15.9989   Max.   :15.9989  
##    GSM545639         GSM545640          GSM545641         GSM545642       
##  Min.   : 0.2665   Min.   : 0.08381   Min.   : 0.4029   Min.   : 0.05743  
##  1st Qu.: 3.8352   1st Qu.: 3.87108   1st Qu.: 3.8324   1st Qu.: 3.83935  
##  Median : 5.8931   Median : 5.89739   Median : 5.8930   Median : 5.89279  
##  Mean   : 6.2754   Mean   : 6.27513   Mean   : 6.2761   Mean   : 6.27641  
##  3rd Qu.: 8.4288   3rd Qu.: 8.43237   3rd Qu.: 8.4306   3rd Qu.: 8.43054  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545643         GSM545644         GSM545645          GSM545646      
##  Min.   : 0.3071   Min.   : 0.4294   Min.   : 0.03132   Min.   : 0.3196  
##  1st Qu.: 3.8600   1st Qu.: 3.8814   1st Qu.: 3.84646   1st Qu.: 3.8684  
##  Median : 5.8883   Median : 5.8968   Median : 5.89639   Median : 5.9005  
##  Mean   : 6.2759   Mean   : 6.2753   Mean   : 6.27705   Mean   : 6.2748  
##  3rd Qu.: 8.4299   3rd Qu.: 8.4290   3rd Qu.: 8.42751   3rd Qu.: 8.4301  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545647         GSM545648           GSM545649         GSM545650      
##  Min.   : 0.1531   Min.   : 0.008351   Min.   : 0.1819   Min.   : 0.3828  
##  1st Qu.: 3.8076   1st Qu.: 3.839472   1st Qu.: 3.8272   1st Qu.: 3.8240  
##  Median : 5.8911   Median : 5.896806   Median : 5.8958   Median : 5.8992  
##  Mean   : 6.2754   Mean   : 6.277289   Mean   : 6.2767   Mean   : 6.2751  
##  3rd Qu.: 8.4308   3rd Qu.: 8.430428   3rd Qu.: 8.4304   3rd Qu.: 8.4293  
##  Max.   :15.9989   Max.   :15.998947   Max.   :15.9989   Max.   :15.9989  
##    GSM545651         GSM545652          GSM545653         GSM545654       
##  Min.   : 0.1291   Min.   : 0.07773   Min.   : 0.3255   Min.   : 0.01947  
##  1st Qu.: 3.8631   1st Qu.: 3.84617   1st Qu.: 3.8466   1st Qu.: 3.85324  
##  Median : 5.8989   Median : 5.89233   Median : 5.8969   Median : 5.89586  
##  Mean   : 6.2770   Mean   : 6.27683   Mean   : 6.2754   Mean   : 6.27742  
##  3rd Qu.: 8.4321   3rd Qu.: 8.43029   3rd Qu.: 8.4304   3rd Qu.: 8.43093  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545655           GSM545656       
##  Min.   : 0.005867   Min.   : 0.04854  
##  1st Qu.: 3.838326   1st Qu.: 3.83994  
##  Median : 5.897306   Median : 5.89339  
##  Mean   : 6.276177   Mean   : 6.27702  
##  3rd Qu.: 8.429420   3rd Qu.: 8.42844  
##  Max.   :15.998928   Max.   :15.99893

The Phenotype Data contains 42 samples and 40 features for the samples. The Feature Data: The feature data contains 30967 probes and 10 features for each samples.

Looking at the summary of expressions, the values for each sample falls between 0-16. Suggesting that these values a log2 normalized. Further, the figure below shows a boxplot of all the gene count values for each sample. As seen below, all the samples have lined up unifromly suggesting that the data is already log normalized.

boxplot(exprs(gse), outline = FALSE, col = "gold")

Inspecting clinical Data

There are several columns within the metadata many of these are repeating. The most important of these columns are the column-1 - title, and columns 36 to 40 age:ch1,histology:ch1,sex:ch1,stage:ch1,tissue:ch1. Further, most of the column names have a “:ch1” at the end. This was removed. Lastly, the histology and stage columns have no data for negative samples. This was changed to “neg”.

#Preparing sample metadata
samplesinfo <- pData(gse) #gettign sample data
samplesinfo <- samplesinfo[,c(1,36:40)] #There are multiple columns for same data, acquiring relevant metadata information.
colnames(samplesinfo) <- gsub(":ch1","",colnames(samplesinfo))

samplesinfo <- samplesinfo%>%
  mutate(across(c(3,5),~replace_na(.,"Neg")))

samplesinfo <- samplesinfo%>%
  mutate_all(~gsub(" years","",.))%>%
  mutate(across(c(histology,sex,stage,tissue),factor))%>%
  mutate(age = as.numeric(age))%>%
  mutate(tissue = ifelse(tissue == "primary normal lung tissue","normal","tumor"))

samplesinfo$histology <- relevel(samplesinfo$histology, ref = "Neg")
samplesinfo$stage <- relevel(samplesinfo$stage, ref = "Neg")
samplesinfo$tissue <- relevel(as.factor(samplesinfo$tissue), ref = "normal")

#Prepare Features Metadata 
featuresinfo <- fData(gse)
#Prepare expression data
exprs_data <- exprs(gse)

The data set represents 42 lung tissue samples, 21 primary normal lung tissues and 21 primary lung tumor tissues. The tumor tissues are representing six stages of lung cancer IA,IB,IIB,IIIA,IIIB,IV. Further two hematological patterns are also represented in tumor samples, these are: adenocarcinoma (AD) and Squamous cell carcinoma (SQ).

samplesinfo%>%
  group_by(tissue, stage,histology)%>%
  tally()%>%
  spread(histology,n)

Clustering and PCA analysis

corMatrix <- cor(exprs_data, use = "c")
rownames(samplesinfo) <- colnames(corMatrix)
pheatmap(corMatrix, annotation_col =samplesinfo[,3:6], annotation_row = samplesinfo[,3:6], cluster_rows = T, cluster_cols = T)

#Hierarchical cluster based dimension analysis
htree <- hclust(dist(t(exprs_data)), method = "average")
plot(htree) #It seems there is a specific pattern of clustering within samples. And this might be biologically relevant difference so I think it is better to makes sure we check with PCA,the variance cotnribution. 

pca_gse <- prcomp(t(exprs_data)) # Run PCA analysis

screeplot(pca_gse, npcs=min(10,length(pca_gse$sdev)),type = c("barplot","lines")) #Screeplots of all the PC;s and their cotnribution. 

# PCA plot by tissue
p1 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=tissue, label = paste(tissue)))+
  geom_point(size=5)+
  geom_text_repel()
  

p2 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=stage, label = paste(stage)))+
  geom_point()+
  geom_text_repel()

p3 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=histology, label = paste(histology)))+
  geom_point()+
  geom_text_repel()

p4 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=title, label = paste(title)))+
  geom_point(size = 5)+
  geom_text_repel(size = 5)+
  ggsci::scale_fill_nejm()


ggarrange(p1,p2,p3,p4, ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom")

Filter dataset:

First I am filtering out only genes that belong to group-6 in the array as that represents the control genes. Lowly expressed genes must be filtered out as they can deviate the results of gene expression. The cut off for low expression is median gene expression of each sample. Next, I am applying three seperate strategies: 1) All samples have a data 2) At least 50% samples have all the data 3) At least 25% of sample have a data. However, this is a microarray dataset.

Exploring cut offs:

control_list <- rownames(gse@featureData[which(gse@featureData@data$GROUP ==6)]) #create a vector list of controls
control_matrix <- exprs_data[rownames(exprs_data)%in% control_list,] #isolate a matrix of just controls

mean_control <- colMeans(control_matrix)
mean_control <- data.frame(mean_control)%>%
  rownames_to_column(var = "control")
colnames(mean_control) <- c("controls","mean")

cat(paste0("Mean of Controls= ",mean(control_matrix),"\n","Mean of data= ",mean(exprs_data),"\n", "Median of Controls= ",median(control_matrix),"\n","Median of data= ",median(exprs_data)))
## Mean of Controls= 8.31688252769473
## Mean of data= 6.27604254788615
## Median of Controls= 7.96752265
## Median of data= 5.894992
cutoff <- median(exprs_data) #take median of expression dataset
is_expressed <- exprs_data>cutoff

keep_all<- rowSums(is_expressed) >=42 #stringent should be present in all samples.
keep_50 <- rowSums(is_expressed)>=21 #should be present in at leasst 50% samples.
keep <- rowSums(is_expressed)<10 # lenient threshold present in at least 25% samples

gse_filt <- rbind(table(keep),table(keep_50),table(keep_all))
rownames(gse_filt) <- c("25%","50%","100%")
gse_filt
##      FALSE  TRUE
## 25%  17877 13090
## 50%  15596 15371
## 100% 22070  8897
gse_25 <- gse[keep,]
gse_50 <- gse[keep_50,]
gse_100 <- gse[keep_all,]

dim_table <- data.frame(SampleCount = c("25%","50%","100%"),
                        features = c(dim(gse_25)[1],dim(gse_50)[1],dim(gse_100)[1]),
                        features = c(dim(gse_25)[2],dim(gse_50)[2],dim(gse_100)[2]))

print(dim_table)
##   SampleCount features features.1
## 1         25%    13090         42
## 2         50%    15371         42
## 3        100%     8897         42
plot(density(control_matrix), main = "Expression Density", xlab = "log2 intensity",)
abline(v = c(1,6), col = c("red","blue"), lty = 2)  # typical cutoff

Filter genes:

Differential Expression

Combined effect and interaction effect

set.seed(1234)
design_combined <- model.matrix(~0+tissue+ histology+stage, data = samplesinfo)
design_interaction <- model.matrix(~tissue*histology*stage, data = samplesinfo)

AD versus SQ versus neg

design_gse <- model.matrix(~0+samplesinfo$histology)
colnames(design_gse)<-c("neg","AD","SQ")

fit <- lmFit(exprs(gse_100),design_gse) #fit all the genes
contrasts_gse <- makeContrasts( ADvsneg = AD-neg, #AD versus negative
                                SQvsneg = SQ-neg, #SQ versus negative
                                ADvsSQ = AD-SQ, #AD versus SQ
                                SQvsAD = SQ-AD, #SQ versus AD
                                avsavg = (AD+SQ)/2-neg, #negative versus combined disease how significantly different SQ and AD are from negative.
                                ADvsSQtoneg = ((AD-neg)-(SQ-neg)), #Checks if AD is significantly more different than SQ
                                levels =design_gse)
cont_names <- colnames(contrasts_gse)

result_all <- data.frame()
for (i in cont_names){
  fit_contrast <- contrasts.fit(fit, contrasts_gse[,i])
  fit_contrast <- eBayes(fit_contrast)
  results <- topTable(fit_contrast, adjust="fdr", number=Inf)
  results$Genes <- rownames(results)
  results$Contrasts<- i
  result_all <- bind_rows(result_all,results)
  
  plot <-ggplot(results, aes(x = logFC, y = -log10(adj.P.Val))) +
    geom_point(aes(color = adj.P.Val < 0.05), alpha = 0.5) +
    scale_color_manual(values = c("black", "red")) +
    labs(title = paste("Volcano Plot:", i), x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
    theme_minimal()
  print(plot)
  ggsave(filename = paste0(i,"_volcano.png"), path = "../figures")
  
}

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image
#volcano plot
ggplot(result_all, aes(x = logFC, y = -log10(adj.P.Val), color = Contrasts)) +
  geom_point(alpha = 0.5) +
  labs(title = "Volcano Plot for Multiple Contrasts", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
  theme_minimal()

ggsave(filename = "volcano_all_contrasts.png",path = "../figures")
## Saving 7 x 5 in image

WGCNA

gsg <- goodSamplesGenes(t(exprs_data)) #explore data for WGCNA to ensure all genes and columns are good. 
##  Flagging genes and samples with too many missing values...
##   ..step 1
print(paste0("All samples are looking good: "))
## [1] "All samples are looking good: "
head(gsg$allOK) #IF TRUE all the values are OK. 
## [1] TRUE
head(gsg$goodGenes) #All genes are OK. 
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
head(gsg$goodSamples) # all samples are OK (42 of 42)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE

Chose Threshold

powers <- c(1:50)

sft <- pickSoftThreshold(t(exprs_data), powerVector = powers, verbose = 5) #analyze power at multiple threshold values
## pickSoftThreshold: will use block size 1444.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1444 of 30967
## Warning: executing %dopar% sequentially: no parallel backend registered
##    ..working on genes 1445 through 2888 of 30967
##    ..working on genes 2889 through 4332 of 30967
##    ..working on genes 4333 through 5776 of 30967
##    ..working on genes 5777 through 7220 of 30967
##    ..working on genes 7221 through 8664 of 30967
##    ..working on genes 8665 through 10108 of 30967
##    ..working on genes 10109 through 11552 of 30967
##    ..working on genes 11553 through 12996 of 30967
##    ..working on genes 12997 through 14440 of 30967
##    ..working on genes 14441 through 15884 of 30967
##    ..working on genes 15885 through 17328 of 30967
##    ..working on genes 17329 through 18772 of 30967
##    ..working on genes 18773 through 20216 of 30967
##    ..working on genes 20217 through 21660 of 30967
##    ..working on genes 21661 through 23104 of 30967
##    ..working on genes 23105 through 24548 of 30967
##    ..working on genes 24549 through 25992 of 30967
##    ..working on genes 25993 through 27436 of 30967
##    ..working on genes 27437 through 28880 of 30967
##    ..working on genes 28881 through 30324 of 30967
##    ..working on genes 30325 through 30967 of 30967
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1   0.0123  0.283          0.846 6.42e+03  6.36e+03 10200.0
## 2      2   0.3370 -1.010          0.820 2.12e+03  1.96e+03  4950.0
## 3      3   0.5870 -1.380          0.850 8.90e+02  7.33e+02  2850.0
## 4      4   0.6860 -1.500          0.877 4.37e+02  3.10e+02  1820.0
## 5      5   0.7390 -1.520          0.895 2.40e+02  1.43e+02  1230.0
## 6      6   0.7980 -1.480          0.923 1.42e+02  7.06e+01   875.0
## 7      7   0.8380 -1.470          0.946 8.98e+01  3.66e+01   652.0
## 8      8   0.8340 -1.540          0.950 5.96e+01  1.98e+01   524.0
## 9      9   0.8330 -1.610          0.959 4.11e+01  1.11e+01   435.0
## 10    10   0.8490 -1.640          0.974 2.92e+01  6.44e+00   368.0
## 11    11   0.8630 -1.660          0.984 2.14e+01  3.84e+00   315.0
## 12    12   0.8770 -1.660          0.989 1.60e+01  2.35e+00   272.0
## 13    13   0.8860 -1.660          0.992 1.22e+01  1.47e+00   238.0
## 14    14   0.8950 -1.660          0.990 9.49e+00  9.43e-01   209.0
## 15    15   0.9030 -1.650          0.991 7.48e+00  6.13e-01   184.0
## 16    16   0.9110 -1.640          0.995 5.97e+00  4.04e-01   164.0
## 17    17   0.9210 -1.630          0.997 4.83e+00  2.69e-01   146.0
## 18    18   0.9220 -1.630          0.996 3.94e+00  1.82e-01   131.0
## 19    19   0.9260 -1.620          0.996 3.25e+00  1.24e-01   118.0
## 20    20   0.9280 -1.600          0.994 2.71e+00  8.48e-02   107.0
## 21    21   0.9320 -1.590          0.994 2.27e+00  5.87e-02    96.6
## 22    22   0.9390 -1.580          0.995 1.91e+00  4.11e-02    87.9
## 23    23   0.9420 -1.570          0.996 1.63e+00  2.91e-02    80.4
## 24    24   0.9460 -1.560          0.995 1.39e+00  2.08e-02    73.6
## 25    25   0.9480 -1.550          0.996 1.20e+00  1.49e-02    67.6
## 26    26   0.9500 -1.540          0.996 1.03e+00  1.07e-02    62.1
## 27    27   0.9490 -1.540          0.992 8.99e-01  7.79e-03    57.2
## 28    28   0.9490 -1.530          0.991 7.84e-01  5.69e-03    52.8
## 29    29   0.9510 -1.520          0.989 6.87e-01  4.16e-03    48.8
## 30    30   0.9510 -1.510          0.988 6.05e-01  3.05e-03    45.2
## 31    31   0.9510 -1.500          0.987 5.34e-01  2.24e-03    41.9
## 32    32   0.9510 -1.500          0.986 4.73e-01  1.66e-03    38.9
## 33    33   0.9500 -1.490          0.984 4.21e-01  1.23e-03    36.1
## 34    34   0.9530 -1.490          0.986 3.76e-01  9.16e-04    33.6
## 35    35   0.9510 -1.490          0.987 3.36e-01  6.83e-04    31.3
## 36    36   0.9530 -1.480          0.988 3.01e-01  5.10e-04    29.2
## 37    37   0.9590 -1.470          0.994 2.71e-01  3.82e-04    27.3
## 38    38   0.9600 -1.460          0.993 2.45e-01  2.87e-04    25.5
## 39    39   0.9640 -1.460          0.997 2.21e-01  2.16e-04    23.8
## 40    40   0.9650 -1.450          0.998 2.00e-01  1.64e-04    22.3
## 41    41   0.9660 -1.450          0.998 1.82e-01  1.24e-04    20.9
## 42    42   0.9670 -1.440          0.998 1.66e-01  9.40e-05    19.6
## 43    43   0.9680 -1.430          0.998 1.51e-01  7.11e-05    18.4
## 44    44   0.9690 -1.430          0.997 1.38e-01  5.42e-05    17.3
## 45    45   0.9690 -1.420          0.995 1.27e-01  4.14e-05    16.2
## 46    46   0.9700 -1.420          0.995 1.16e-01  3.15e-05    15.3
## 47    47   0.9650 -1.420          0.989 1.07e-01  2.41e-05    14.5
## 48    48   0.9600 -1.420          0.983 9.83e-02  1.84e-05    13.7
## 49    49   0.9670 -1.410          0.991 9.06e-02  1.42e-05    12.9
## 50    50   0.9660 -1.410          0.988 8.37e-02  1.08e-05    12.3
#plot to choose an optimal power

# Extract data

sft_df <- sft$fitIndices%>%
  select(Power = Power,
                SFT_R2 = SFT.R.sq,
                Slope = slope,
                MeanConnectivity = mean.k.)
sft_df <- sft_df%>%
  mutate(SignedR2 = -sign(Slope)*SFT_R2)


#Scale-Free Topology Index plot 
sfree <- ggplot(sft_df,aes(x = Power, y = SignedR2, label = Power))+
  geom_point(color= "#FF0099",size = 3)+
  geom_text(vjust = 1.5, size = 3.5)+
  geom_hline(yintercept = c(0.8,0.9), linetype= "dashed", color = c("#300666","#300999"),linewidth=1)+
  geom_vline(xintercept = 15, linetype= "dashed", color = "red",linewidth=0.6)+
  labs(title = "Scale-Free Topology Fit Index", y= bquote(paste("Signed ",R^2)))+
  theme_pubr()

#Mean connectivity plot

mfree <- ggplot(sft_df, aes(x = Power, y = MeanConnectivity, label=Power))+
  geom_point(color = "#FF0099", size = 3)+
  geom_text(vjust=-1, size=3.5)+
  labs(title = "Mean Connectivity", y = "Mean Connectivity", x = "Soft Threshold") +
  geom_hline(yintercept = 100, linetype= "dashed", color = "#300999",linewidth=1)+
  geom_vline(xintercept = 15, linetype= "dashed", color = "red",linewidth=0.6)+
  theme_pubr()

ggarrange(plotlist = list(sfree,mfree), nrow = 2,ncol = 1)

ggsave(filename="WGCNA_Threshold.png", path = "../figures", dpi = 600, width = 20, height = 10, units = "in" )

chosen_threshold <- sft_df$Power[12]
mean_conectivity <- sft_df$MeanConnectivity[chosen_threshold]
scale_free_top <- sft_df$SFT_R2[chosen_threshold]

I chose the threshold to be at 15 as this had a mean connectivity of at scale free topology index of

softPower <- 11

cat("The soft power is",softPower)
## The soft power is 11
adj_matrix <- t(exprs_data) #filtered genes present in all samples
adjacency <- adjacency(adj_matrix, power = softPower)

# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM <- 1 - TOM

# Hierarchical clustering
geneTree <- hclust(as.dist(dissTOM), method = "average")
plot(geneTree, main = "Gene clustering")

# Module identification using dynamic tree cut
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = 30,
                             cutHeight = 0.25)
##  cutHeight set too low: no merges below the cut.
cat("DYnamic tree with modules:")
## DYnamic tree with modules:
table(dynamicMods)
## dynamicMods
##     0 
## 30967
# Assign module colors
dynamicColors <- labels2colors(dynamicMods)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

cat("DYnamic tree modules with colors:")
## DYnamic tree modules with colors:
table(dynamicColors)
## dynamicColors
##  grey 
## 30967
# STEP-4: Relate Modules to Traits
# Calculate eigengenes
MEs <- moduleEigengenes(adj_matrix, colors = dynamicColors)$eigengenes
MEs <- orderMEs(MEs)

# Correlate eigengenes with traits
traits <- samplesinfo %>%
  mutate(across(where(is.factor), ~as.numeric(as.factor(.)))) %>%
  select(-c("title","sex"))  # Remove non-trait columns like ID if present

#traits <- model.matrix(~ histology + sex + stage + tissue + age, data = samplesinfo)[,-1]


moduleTraitCor <- cor(MEs,traits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples = ncol(adj_matrix))

all(rownames(MEs) == rownames(traits))  # must be TRUE
## [1] FALSE
# Plot correlation heatmap
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(traits),
               yLabels = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                                  signif(moduleTraitPvalue, 1), ")", sep = ""),
               main = "Module-Trait Relationships")

# STEP-5: GET Hub genes from significant moduels
# Choose a module of interest (e.g. "blue")
module <- "blue"
module_genes <- colnames(adj_matrix)[which(dynamicColors == module)]

module_genes <- colnames(dynamicColors)[which(dynamicColors == module)]

# Check that this returns genes
length(module_genes)  # Should be 2466
## [1] 0
head(module_genes)    # Should show gene names
## NULL
# Calculate intra-modular connectivity (hubness)
kWithin <- intramodularConnectivity(adjacency, dynamicColors)

# Order genes by connectivity
hub_genes <- module_genes[order(kWithin[module_genes, "kWithin"], decreasing = TRUE)]
head(hub_genes, 10)  # Top 10 hub genes
## NULL

# tutorial: https://sbc.shef.ac.uk/geo_tutorial/tutorial.nb.html